Data

Which years are missing a metric

dat %>% 
  ggplot(aes(year, lakeid, fill=is.na(daynum))) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~metric, nrow=6) 

Remaining data issues:

  • Northern lakes don’t have DOC until until 1986; missing first 4 years when all other variables are available. Potential fixes:
    • Exclude DOC; start analyses in 1986; fill DOC with e.g., median value, or model of other variables?
  • Southern lakes missing chlorophyll data: 1996 - 1999 and 2002 - 2004; 2002-04 is bad/uncalibrated values, used DOY but not magnitude; earlier years fill with median
  • Zoop data issues:
    • CB missing daphnia peak 1995(?): fill with median
    • FI missing several years from 1996 - 2006; fill with median

Predict DOC daynum from other variable daynum’s in each northern lake

no_dups = dat %>% group_by(lakeid, year, metric) %>% summarise(N = n()) %>% filter(N == 1)
## `summarise()` has grouped output by 'lakeid', 'year'. You can override using
## the `.groups` argument.
n_lakes_wide = left_join(no_dups, dat) %>% 
  select(-sampledate) %>% 
  pivot_wider(names_from = metric, values_from = daynum) %>% 
  filter(lakeid %in% c("AL", "BM", "CB", "CR", "SP", "TB", "TR"))
## Joining, by = c("lakeid", "year", "metric")
hold_data = na.omit(data.frame(n_lakes_wide %>%  ungroup() %>% select(-year, -N))) # , -doc_predict
all = lm(doc_epiMax~(iceon+straton+stratoff+energy+
                 stability+anoxia_summer+
                 secchi_max)*lakeid, data=hold_data)
i_o = lm(doc_epiMax~1, data=hold_data)
hold_step = step(i_o, scope=formula(all))
## Start:  AIC=1761.44
## doc_epiMax ~ 1
## 
##                 Df Sum of Sq    RSS    AIC
## + lakeid         6     74137 395186 1733.7
## + stratoff       1     15998 453325 1755.4
## <none>                       469323 1761.4
## + secchi_max     1      1334 467989 1762.8
## + iceon          1       565 468758 1763.2
## + straton        1       551 468772 1763.2
## + energy         1       305 469018 1763.3
## + anoxia_summer  1       234 469089 1763.3
## + stability      1        65 469258 1763.4
## 
## Step:  AIC=1733.72
## doc_epiMax ~ lakeid
## 
##                 Df Sum of Sq    RSS    AIC
## + stratoff       1      6524 388662 1731.9
## <none>                       395186 1733.7
## + energy         1      2484 392702 1734.3
## + anoxia_summer  1      2400 392786 1734.3
## + iceon          1       595 394591 1735.4
## + straton        1       121 395065 1735.7
## + secchi_max     1        60 395126 1735.7
## + stability      1        56 395130 1735.7
## - lakeid         6     74137 469323 1761.4
## 
## Step:  AIC=1731.88
## doc_epiMax ~ lakeid + stratoff
## 
##                   Df Sum of Sq    RSS    AIC
## <none>                         388662 1731.9
## + energy           1      3261 385401 1731.9
## + anoxia_summer    1      2560 386102 1732.3
## + iceon            1       407 388255 1733.6
## - stratoff         1      6524 395186 1733.7
## + straton          1       202 388460 1733.8
## + stability        1       183 388479 1733.8
## + secchi_max       1        21 388640 1733.9
## + stratoff:lakeid  6     11160 377502 1737.2
## - lakeid           6     64663 453325 1755.4
doc_model = lm(doc_epiMax~(iceon+straton+stratoff+energy+
                 stability+anoxia_summer+
                 secchi_max)*lakeid, 
               data=n_lakes_wide, 
               na.action = na.exclude)
summary(doc_model)
## 
## Call:
## lm(formula = doc_epiMax ~ (iceon + straton + stratoff + energy + 
##     stability + anoxia_summer + secchi_max) * lakeid, data = n_lakes_wide, 
##     na.action = na.exclude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -150.321  -17.502    3.039   23.126  119.066 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)  
## (Intercept)              64.83950  170.68959   0.380   0.7045  
## iceon                     0.22703    0.36262   0.626   0.5321  
## straton                   0.15476    0.36054   0.429   0.6683  
## stratoff                  0.51902    0.31510   1.647   0.1013  
## energy                   -0.17673    0.22397  -0.789   0.4311  
## stability                -0.12534    0.14804  -0.847   0.3983  
## anoxia_summer            -0.03987    0.17586  -0.227   0.8209  
## secchi_max               -0.07954    0.08274  -0.961   0.3376  
## lakeid.L                319.71870  449.12342   0.712   0.4775  
## lakeid.Q                395.98906  440.10607   0.900   0.3695  
## lakeid.C                235.35696  455.90738   0.516   0.6063  
## lakeid^4                 87.51056  456.38546   0.192   0.8482  
## lakeid^5               -416.46790  462.59187  -0.900   0.3692  
## lakeid^6                352.77072  445.11973   0.793   0.4291  
## iceon:lakeid.L           -1.09045    0.95880  -1.137   0.2569  
## iceon:lakeid.Q           -0.53411    0.94683  -0.564   0.5734  
## iceon:lakeid.C           -0.47248    0.95370  -0.495   0.6209  
## iceon:lakeid^4            0.09223    0.99242   0.093   0.9261  
## iceon:lakeid^5            0.44611    0.94858   0.470   0.6387  
## iceon:lakeid^6           -0.10044    0.95529  -0.105   0.9164  
## straton:lakeid.L         -0.99644    0.96543  -1.032   0.3034  
## straton:lakeid.Q          0.14198    0.87945   0.161   0.8719  
## straton:lakeid.C          0.21534    0.97792   0.220   0.8260  
## straton:lakeid^4          0.79795    1.02354   0.780   0.4367  
## straton:lakeid^5          1.52519    0.99024   1.540   0.1253  
## straton:lakeid^6          0.71254    0.87728   0.812   0.4177  
## stratoff:lakeid.L        -0.22517    0.84120  -0.268   0.7893  
## stratoff:lakeid.Q        -0.50789    0.75106  -0.676   0.4998  
## stratoff:lakeid.C         0.17399    0.84436   0.206   0.8370  
## stratoff:lakeid^4        -0.26699    0.93880  -0.284   0.7764  
## stratoff:lakeid^5         0.87622    0.84751   1.034   0.3026  
## stratoff:lakeid^6        -1.57643    0.76545  -2.059   0.0409 *
## energy:lakeid.L           0.35302    0.59611   0.592   0.5545  
## energy:lakeid.Q          -0.39372    0.55165  -0.714   0.4763  
## energy:lakeid.C          -0.19354    0.59663  -0.324   0.7460  
## energy:lakeid^4          -0.13369    0.64901  -0.206   0.8370  
## energy:lakeid^5          -0.43207    0.59716  -0.724   0.4703  
## energy:lakeid^6          -0.02438    0.55976  -0.044   0.9653  
## stability:lakeid.L       -0.07321    0.34484  -0.212   0.8321  
## stability:lakeid.Q        0.17086    0.37799   0.452   0.6518  
## stability:lakeid.C       -0.11656    0.39559  -0.295   0.7686  
## stability:lakeid^4       -0.39170    0.34527  -1.134   0.2581  
## stability:lakeid^5       -0.14418    0.44054  -0.327   0.7438  
## stability:lakeid^6       -0.13469    0.43468  -0.310   0.7570  
## anoxia_summer:lakeid.L    0.29492    0.47554   0.620   0.5359  
## anoxia_summer:lakeid.Q    0.16199    0.46703   0.347   0.7291  
## anoxia_summer:lakeid.C   -0.07344    0.45279  -0.162   0.8713  
## anoxia_summer:lakeid^4   -0.19354    0.50427  -0.384   0.7016  
## anoxia_summer:lakeid^5   -0.16585    0.42883  -0.387   0.6994  
## anoxia_summer:lakeid^6    0.05221    0.45990   0.114   0.9097  
## secchi_max:lakeid.L       0.50747    0.23435   2.165   0.0317 *
## secchi_max:lakeid.Q      -0.42967    0.23513  -1.827   0.0693 .
## secchi_max:lakeid.C      -0.17325    0.22503  -0.770   0.4424  
## secchi_max:lakeid^4      -0.08336    0.19516  -0.427   0.6698  
## secchi_max:lakeid^5      -0.14772    0.21531  -0.686   0.4935  
## secchi_max:lakeid^6       0.15293    0.20545   0.744   0.4576  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42.84 on 180 degrees of freedom
##   (37 observations deleted due to missingness)
## Multiple R-squared:  0.2977, Adjusted R-squared:  0.08308 
## F-statistic: 1.387 on 55 and 180 DF,  p-value: 0.05718
n_lakes_wide$doc_predict = predict(doc_model, n_lakes_wide)

cor = n_lakes_wide %>% 
  group_by(lakeid) %>% 
  summarise(r = paste("r =", round(cor(doc_epiMax, doc_predict, use="complete.obs"), 3)))

n_lakes_wide %>% 
  ggplot(aes(x=doc_epiMax, y=doc_predict, color=as.character(lakeid))) +
  geom_point()+
  theme_bw() +
  labs(color="lakeid")+
  geom_abline(slope=1, intercept = 0) + 
  facet_wrap(~lakeid) + 
  geom_text(aes(label=r), data=cor, x=300, y=0) +
  labs(x="obs peak DOC (daynum)",  y="modeled peak DOC (daynum)")
## Warning: Removed 37 rows containing missing values (`geom_point()`).

# not good but vaguely positive relationship; use this instead of median?
missing_doc = n_lakes_wide %>% filter(is.na(doc_epiMax))

missing_doc$doc_fill = round(predict(doc_model, missing_doc))

doc_fill = missing_doc %>% 
  select(lakeid, year, doc_fill) %>% 
  rename(daynum_fill = doc_fill) %>% 
  mutate(metric = "doc_epiMax") %>% 
  filter(year < 2000)

Create filled metric column and filled

all_fill = bind_rows(doc_fill)
dat_filled = full_join(dat, all_fill) %>% 
  mutate(filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill), TRUE, FALSE)) %>% 
  mutate(daynum_fill = ifelse(is.na(daynum) & !is.na(daynum_fill), daynum_fill, daynum))
## Joining, by = c("lakeid", "metric", "year")
dat_filled$lakeid = factor(dat_filled$lakeid, 
                    levels = c("AL", "BM", "CB", "CR", "SP", "TB", "TR", "FI", "ME", "MO", "WI"), 
                    ordered = T)
vars = c("iceoff", "straton", "secchi_all", "secchi_max","secchi_min", "zoopDensity","zoopDensity_CC",
        "doc_epiMax","totpuf_epiMax", "totpuf_epiMin", "totpuf_hypoMax", "totpuf_hypoMin", 
        "anoxia", "anoxia_summer", "stability", "energy", "stratoff", "iceon")
dat_filled$metric = factor(dat_filled$metric,
                    levels = rev(vars),
                    ordered=T)

dat_filled %>% 
  ggplot(aes(year, metric, fill=filled_flag)) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~lakeid, nrow=6)

## Additional trimming/filling

Trim to “good” years, fill FI ice data with Monona, then fill additional missing with median values for each lake/metric

df_yearsWant = dat_filled %>% 
  filter((lakeid %in% c("FI", "ME", "MO", "WI") & year %in% 1996:2018) |
           (!(lakeid %in% c("FI", "ME", "MO", "WI")) & year %in% 1982:2018))

fi_icefill = df_yearsWant %>% 
  filter(lakeid == "MO" & metric %in% c("iceoff", "iceon")) %>% 
  mutate(lakeid = "FI") %>% 
  select(-daynum, -filled_flag, -sampledate) %>% 
  rename(icefill = daynum_fill)

df_yearsWant = df_yearsWant %>% 
  full_join(fi_icefill) %>% 
  mutate(daynum_fill = ifelse(is.na(daynum_fill) & !is.na(icefill), 
                              icefill, daynum),
         filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill) & !is.na(icefill), 
                              TRUE, filled_flag))
## Joining, by = c("lakeid", "metric", "year")
df_yearsWant %>% 
  ggplot(aes(year, metric, fill=filled_flag)) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~lakeid, nrow=6)

Final fill w/ medians

all_lys = df_yearsWant %>% 
  select(lakeid, year) %>% 
  distinct()

metric = unique(df_yearsWant$metric)

all_lys_want = expand_grid(all_lys, metric)

dat_final = left_join(all_lys_want, df_yearsWant)
## Joining, by = c("lakeid", "year", "metric")
medians = dat_final %>% 
  group_by(lakeid, metric) %>% 
  summarise(median_daynum = median(daynum_fill, na.rm=T))
## `summarise()` has grouped output by 'lakeid'. You can override using the
## `.groups` argument.
dat_final = dat_final %>% 
  left_join(medians) %>% 
  mutate(daynum_fill = ifelse(is.na(daynum_fill), median_daynum, daynum_fill)) %>% 
  mutate(filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill), TRUE, filled_flag)) %>% 
  select(-icefill, -median_daynum)
## Joining, by = c("lakeid", "metric")
dat_final %>%  
  ggplot(aes(year, metric, fill=filled_flag)) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~lakeid, nrow=6)

write_csv(dat_final, "Data/analysis_ready/final_combined_dates_filled_v2.csv")